rm(list=ls())
library(haven)
library(plyr)

# Read in the replication data (Stata .dta file).
repdata <- read_dta('data_from_hh/repdata.dta')

# Attribute columns varied in the experiment.
factors <- c("FeatEd", "FeatGender", "FeatCountry", "FeatReason",
             "FeatJob", "FeatExp", "FeatPlans", "FeatTrips", "FeatLang")
# Covariate columns that must be observed for a row to be kept.
cov.list <- c("censusgroup", "Party_ID", "Ideology", "ppethm", "ppeducat",
              "W1_Q4")

# Drop rows with a missing outcome, CaseID, or experimental attribute.
required_cols <- c("Chosen_Immigrant", "CaseID", factors)
repdata <- repdata[complete.cases(repdata[, required_cols]), ]

# Keep only rows where every covariate is non-missing and strictly positive
# (negative codes mean "Refused"; note that `> 0` also drops zeros).
# The dplyr verbs are namespace-qualified and called without a pipe because
# this script attaches only haven and plyr: `%>%` would be undefined and a
# bare `filter` would resolve to stats::filter.
repdata <- dplyr::filter_at(
  repdata,
  .vars = cov.list,
  .vars_predicate = dplyr::all_vars(. > 0 & !is.na(.))
)
# NOTE(review): W1_Q5 is only checked for NA here, not for negative codes --
# confirm it has no "Refused" code that should also be dropped.
repdata <- dplyr::filter(repdata, !is.na(W1_Q5))


## Give the experimental attributes short column names.
# Stripping the "Feat" prefix from `factors` yields:
#   Ed, Gender, Country, Reason, Job, Exp, Plans, Trips, Lang
fac <- sub("^Feat", "", factors)
# Copy each Feat* column into its short-named twin; the originals are kept.
repdata[fac] <- repdata[factors]

# Convert a value-labelled vector into a factor of its label text.
#
# Args:
#   i: a vector carrying a named `labels` attribute, where the names are the
#      label text and the values are the codes stored in `i`.
# Returns:
#   A factor the same length as `i`. Levels follow the order of the `labels`
#   attribute, restricted to the labels that actually occur in `i`. Codes
#   without a matching label become NA.
# Errors:
#   Stops if `i` has no named `labels` attribute. Without this guard such an
#   input silently produced a zero-length factor.
safe_convert <- function(i) {
  # exact = TRUE avoids the partial matching that `$` does on attribute lists
  value_labels <- attr(i, "labels", exact = TRUE)
  if (is.null(value_labels) || is.null(names(value_labels))) {
    stop("safe_convert(): input has no named 'labels' attribute",
         call. = FALSE)
  }
  label_text <- names(value_labels)
  # Translate each code to its label text (NA when the code has no label)
  as_text <- label_text[match(i, value_labels)]
  # Keep only the labels that are used, in their declared order
  used_levels <- label_text[label_text %in% as_text]
  factor(as_text, levels = used_levels)
}
# The attribute columns still carry value labels; replace each one with a
# plain factor of its label text so the levels are visible in outputs.
repdata[fac] <- lapply(repdata[fac], safe_convert)

# Shorten the factor levels so outputs are easier to read.
# Each entry maps a column to a named vector: names are the original label
# text, values are the short replacement. Columns are recoded in list order.
recode_maps <- list(
  Ed = c(
    "No formal education" = "No formal",
    "Equivalent to completing fourth grade in the US" = "Grade4",
    "Equivalent to completing eighth grade in the US" = "Grade8",
    "Equivalent to completing high school in the US" = "HS",
    "Equivalent to completing two years of college in the US" = "2yCol",
    "Equivalent to completing a college degree in the US" = "Col",
    "Equivalent to completing a graduate degree in the US" = "GradDeg"
  ),
  Reason = c(
    "Reunite with family members already in the U.S." = "Family",
    "Seek better job in U.S." = "Job",
    "Escape political/religious persecution" = "Persecution"
  ),
  Lang = c(
    "During the admission interview, this applicant spoke fluent English" = "Fluent",
    "During admission interview, this applicant spoke broken English" = "Broken",
    "During admission interview, this applicant tried to speak English but was unable" = "Unable",
    "During admission interview, this applicant spoke [language] and used an interpreter" = "Interpreter"
  ),
  Exp = c(
    "More than five years of job training and experience" = ">5 years",
    "Three to five years of job training and experience" = "3-5 years",
    "One or two years of job training and experience" = "1-2 years",
    "No job training or prior experience" = "None"
  ),
  Plans = c(
    "Has a contract with a U.S. employer" = "Has contract",
    "Does not have a contract with a U.S. employer but has done job interviews" = "No contract, had interviews",
    "Will look for work after arriving in the U.S." = "Will look after arrival",
    "Has no plans to look for work at this time" = "No plans"
  ),
  Trips = c(
    "Never been to the U.S." = "Never been",
    "Entered U.S. once before on a tourist visa" = "Once with visa",
    "Has visited the U.S. many times before on tourist visas" = "Multiple times with visa",
    "Spent six months with family members in the U.S" = "6 months with family",
    "Entered the U.S. once before without legal authorization" = "Once w/o authorization"
  ),
  Gender = c(
    "female" = "Female",
    "male" = "Male"
  )
)

repdata[names(recode_maps)] <- Map(
  function(column, map) {
    plyr::mapvalues(column, from = names(map), to = unname(map))
  },
  repdata[names(recode_maps)],
  recode_maps
)

# Education additionally gets its levels pinned to this explicit order.
repdata$Ed <- factor(repdata$Ed, levels = unname(recode_maps$Ed))

# Remove the temporary lookup table so the workspace is left as before.
rm(recode_maps)


# Turn the respondent covariates into factors.
# Names of the factor versions, position-matched to `cov.list`; the original
# covariate columns are left untouched.
cov.list2 <- c("census_div", "party_ID", "ideology", "ppEthm", "ppEducat",
               "imm_change")

# ppgender is converted in place rather than copied to a new column.
repdata$ppgender <- safe_convert(repdata$ppgender)

# Build each factor column directly from its source covariate.
repdata[cov.list2] <- lapply(repdata[cov.list], safe_convert)
summary(repdata[, cov.list2])

# Binary split of W1_Q5: values of 50 or more are coded "L", the rest "H".
repdata$ethno <- ifelse(repdata$W1_Q5 >= 50, "L", "H")
# Copy of the raw W1_Q5 value under a second name.
repdata$hisp_prej <- repdata$W1_Q5

# Number the rows within each CaseID x contest_no group (1, 2, ...).
# dplyr is called through its namespace and without a pipe because this
# script attaches only haven and plyr, so `%>%`, `group_by` and a bare `n()`
# would not be found. seq_len() replaces the `1:n()` idiom.
repdata <- dplyr::ungroup(
  dplyr::mutate(
    dplyr::group_by(repdata, CaseID, contest_no),
    choice_id = seq_len(dplyr::n())
  )
)

################ Ordered factors ################

# Re-create Ed and Exp as ordered factors, keeping their current level order
# (factor() also drops any unused levels).
ordered_cols <- c("Ed", "Exp")
repdata[ordered_cols] <- lapply(repdata[ordered_cols], factor, ordered = TRUE)
rm(ordered_cols)

# Standardized W1_Q5 over the full sample, plus its sign-flipped version.
repdata$scale_hisp_prej <- as.vector(scale(repdata$W1_Q5))
repdata$scale_hisp_prej_flip <- -repdata$scale_hisp_prej

# Restrict to rows whose ppEthm is not "Hispanic"; which() also drops rows
# where ppEthm is NA.
repdata_nonhisp <- repdata[which(repdata$ppEthm != "Hispanic"), ]
repdata_nonhisp$ppEthm <- droplevels(repdata_nonhisp$ppEthm)

# Re-standardize within the subset so mean and sd refer to this sample only.
repdata_nonhisp$scale_hisp_prej <- as.vector(scale(repdata_nonhisp$W1_Q5))
repdata_nonhisp$scale_hisp_prej_flip <- -repdata_nonhisp$scale_hisp_prej

# Write the packaged analysis data.
saveRDS(repdata_nonhisp, 'code/packaged_data.RDS')
